# Map each lat/lon point to its country and, for US points, its state
# library(maps)
library(mapdata)
## Loading required package: maps
library(sp)
library(raster)
## 
## Attaching package: 'raster'
## The following object is masked from 'package:skimr':
## 
##     bind
## The following object is masked from 'package:plotly':
## 
##     select
library(rworldmap)
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
# Build (once) and then load a copy of the data annotated with the country
# and US-state polygon attributes of every point. The annotated table is
# cached on disk; the block below only runs when the cache file is missing.
dfWithIndicesCache <- "../output/data.with.indices.Rdata"
if (!file.exists(dfWithIndicesCache)) {
  # Start from the UMAP table, drop its own lat/lon and take the lat/lon
  # columns from the raw table instead.
  # NOTE(review): cbind() pairs rows by position, so this assumes both files
  # have the same rows in the same order -- confirm.
  df <- fread("../output/50.2.umap.tsv.gz")
  df[, c("lat", "lon") := NULL]
  raw <- fread("../output/raw.tsv.gz")
  df <- cbind(df, raw[, c("lat", "lon")])

  # Country polygons; the points are declared to be in the same CRS.
  countriesSP <- getMap(resolution = "low")
  countriesCRS <- CRS(proj4string(countriesSP))
  pointsSP <- SpatialPoints(df[, c("lon", "lat")], proj4string = countriesCRS)
  indices <- over(pointsSP, countriesSP)

  # Level-1 GADM polygons for the USA, re-projected to the country CRS
  # before the overlay.
  usaSP <- raster::getData("GADM", country = "USA", level = 1)
  usaSP <- spTransform(usaSP, countriesCRS)
  indicesUSA <- over(pointsSP, usaSP)

  # Append the country attributes, then the US attributes, and cache.
  df <- cbind(df, indices)
  df <- cbind(df, indicesUSA)
  save(df, file = dfWithIndicesCache)
}
# Always read `df` back from the cache so both paths yield the same object.
load(dfWithIndicesCache)
# Distance metrics for which UMAP embedding columns exist in `df`.
metricTypes <- c("euclidean", "haversine")


# https://umap-learn.readthedocs.io/en/latest/embedding_space.html
# transformations to 2d
# x = np.sin(sphere_mapper.embedding_[:, 0]) * np.cos(sphere_mapper.embedding_[:, 1])
# y = np.sin(sphere_mapper.embedding_[:, 0]) * np.sin(sphere_mapper.embedding_[:, 1])
# z = np.cos(sphere_mapper.embedding_[:, 0])
#
# x = np.arctan2(x, y)
# y = -np.arccos(z)

summary

# Per-column summary of `df` (missingness, cardinality, distribution).
skim(df)
Data summary
Name df
Number of rows 4857144
Number of columns 100
Key NULL
_______________________
Column type frequency:
character 17
factor 36
numeric 45
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Activity 0 1.00 4 10 0 7 0
Name 0 1.00 2 3 0 2 0
Notes 0 1.00 0 8 4184952 15 0
UUID 0 1.00 16 36 0 63 0
Version 0 1.00 0 28 820090 30 0
Visit 0 1.00 0 467 4854672 2473 0
imgS3 0 1.00 0 50 4857004 141 0
GID_0 38323 0.99 3 3 0 1 0
NAME_0 38323 0.99 13 13 0 1 0
GID_1 38323 0.99 7 8 0 42 0
NAME_1 38323 0.99 4 14 0 42 0
VARNAME_1 38323 0.99 2 56 0 42 0
NL_NAME_1 4857144 0.00 NA NA 0 0 0
TYPE_1 38323 0.99 5 5 0 1 0
ENGTYPE_1 38323 0.99 5 5 0 1 0
CC_1 4857144 0.00 NA NA 0 0 0
HASC_1 38323 0.99 5 5 0 42 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
FeatureCla 83980 0.98 FALSE 1 Adm: 4773164, Adm: 0
SOVEREIGNT 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
SOV_A3 83980 0.98 FALSE 12 US1: 4749730, CZE: 6763, CAN: 4769, POL: 4028
TYPE 83980 0.98 FALSE 2 Cou: 4749774, Sov: 23390, Cou: 0, Dep: 0
ADMIN 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
ADM0_A3 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
GEOUNIT 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
GU_A3 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
SUBUNIT 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
SU_A3 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
NAME 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
ABBREV 83980 0.98 FALSE 12 U.S: 4749730, Cz.: 6763, Can: 4769, Pol: 4028
POSTAL 83980 0.98 FALSE 12 US: 4749730, CZ: 6763, CA: 4769, PL: 4028
NAME_FORMA 90912 0.98 FALSE 10 Uni: 4749730, Cze: 6763, Rep: 4028, Rep: 3272
TERR_ 4857144 0.00 FALSE 0 Ass: 0, Auz: 0, Chi: 0, Com: 0
NAME_SORT 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
FIPS_10_ 4857144 0.00 FALSE 0 FG: 0
ISO_A2 83980 0.98 FALSE 12 US: 4749730, CZ: 6763, CA: 4769, PL: 4028
ISO_A3 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
ISO3 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
ISO3.1 83980 0.98 FALSE 12 USA: 4749730, CZE: 6763, CAN: 4769, POL: 4028
ADMIN.1 83980 0.98 FALSE 12 Uni: 4749730, Cze: 6763, Can: 4769, Pol: 4028
REGION 83980 0.98 FALSE 3 Nor: 4754499, Eur: 15385, Sou: 3280, Afr: 0
continent 83980 0.98 FALSE 3 Nor: 4754499, Eur: 15385, Sou: 3280, Afr: 0
GEO3major 83980 0.98 FALSE 3 Nor: 4754499, Eur: 15385, Lat: 3280, Afr: 0
GEO3 83980 0.98 FALSE 6 US: 4749730, Cen: 12026, Can: 4769, Sou: 3280
IMAGE24 83980 0.98 FALSE 7 USA: 4749730, Cen: 12026, Can: 4769, Res: 3272
GLOCAF 83980 0.98 FALSE 6 USA: 4749730, Eur: 13222, Can: 4769, Res: 3272
Stern 83980 0.98 FALSE 3 Nor: 4754499, Eur: 15385, Sou: 3280, Aus: 0
SRESmajor 83980 0.98 FALSE 3 OEC: 4755695, REF: 14189, ALM: 3280, ASI: 0
SRES 83980 0.98 FALSE 5 Nor: 4754499, Cen: 12026, Lat: 3280, New: 2163
GBD 83980 0.98 FALSE 6 Nor: 4754499, Eur: 12026, Lat: 3272, Eur: 2163
AVOIDname 83980 0.98 FALSE 8 US: 4749730, Eur: 11301, Can: 4769, Pol: 4028
LDC 83980 0.98 FALSE 1 oth: 4773164, LDC: 0
SID 83980 0.98 FALSE 1 oth: 4773164, SID: 0
LLDC 83980 0.98 FALSE 1 oth: 4773164, LLD: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Accuracy 0 1.00 6.04 1.59 -1.00 4.74 6.00 8.00 1.000000e+01 ▁▁▇▃▅
Elevation 0 1.00 309.11 308.89 -67.07 146.18 250.75 349.22 1.124989e+04 ▇▁▁▁▁
Heading 0 1.00 164.34 109.76 -1.00 70.66 165.92 259.03 3.600000e+02 ▇▆▇▆▆
Pressure 0 1.00 144.33 202.58 0.00 97.40 98.87 99.85 1.027210e+03 ▇▁▁▁▁
Speed 0 1.00 3.16 7.60 -1.00 0.00 0.00 1.39 4.989000e+01 ▇▁▁▁▁
UnixTime 0 1.00 1615763853.19 60327287.52 1525623850.00 1557692918.50 1622138392.50 1674732350.50 1.704206e+09 ▇▅▃▅▇
HeartRate 4842796 0.00 88.11 28.44 -1.00 76.00 85.00 100.00 1.650000e+02 ▁▁▇▂▁
Distance 4810634 0.01 68964.98 174017.51 2.00 5703.61 14004.90 45458.82 1.060107e+06 ▇▁▁▁▁
NumberOfSteps 4808496 0.01 123151.68 269954.22 4.00 7952.00 15226.00 67482.00 1.153645e+06 ▇▁▁▁▁
AverageActivePace 4822571 0.01 1.05 0.70 0.36 0.73 0.93 1.07 1.509000e+01 ▇▁▁▁▁
CurrentCadence 4822631 0.01 1.67 0.32 0.99 1.48 1.66 1.78 2.710000e+00 ▃▇▇▂▁
CurrentPace 4822631 0.01 0.87 0.20 0.26 0.79 0.87 1.02 1.300000e+00 ▁▂▇▆▂
FloorsAscended 4824429 0.01 14.84 14.30 1.00 3.00 10.00 16.00 5.800000e+01 ▇▆▁▁▁
FloorsDescended 4824118 0.01 11.68 10.49 1.00 5.00 11.00 15.00 4.100000e+01 ▇▆▁▁▁
vAccuracy 4844573 0.00 4.54 3.83 0.80 1.60 3.00 6.40 3.740000e+01 ▇▂▁▁▁
AccelerometerX 4850479 0.00 -0.82 4.54 -13.76 -4.08 -0.42 1.20 9.210000e+00 ▁▆▇▇▅
AccelerometerY 4850479 0.00 -0.45 3.44 -22.32 -0.97 -0.44 0.57 1.690000e+01 ▁▁▇▂▁
AccelerometerZ 4850479 0.00 3.84 7.15 -14.43 -1.25 6.99 9.78 1.337000e+01 ▁▃▁▅▇
ActivityConfidence 4850479 0.00 100.00 0.00 100.00 100.00 100.00 100.00 1.000000e+02 ▁▁▇▁▁
GyroscopeX 4850479 0.00 0.00 0.30 -3.86 -0.01 0.00 0.01 4.170000e+00 ▁▁▇▁▁
GyroscopeY 4850479 0.00 0.01 0.23 -2.83 -0.01 0.00 0.01 3.860000e+00 ▁▁▇▁▁
GyroscopeZ 4850479 0.00 0.00 0.14 -2.71 -0.01 0.00 0.01 1.990000e+00 ▁▁▇▁▁
UserAccelerometerX 4850479 0.00 0.00 0.76 -12.67 -0.15 0.00 0.13 8.760000e+00 ▁▁▇▂▁
UserAccelerometerY 4850479 0.00 -0.04 0.77 -13.89 -0.08 0.00 0.07 1.166000e+01 ▁▁▇▁▁
UserAccelerometerZ 4850479 0.00 0.02 0.80 -16.13 -0.13 0.05 0.19 8.570000e+00 ▁▁▁▇▁
Lightmeter 4855139 0.00 195.51 308.98 1.00 17.00 78.00 147.00 1.067000e+03 ▇▁▁▁▁
umap_euclidean0 0 1.00 4.47 14.10 -33.03 -3.59 4.47 12.56 4.191000e+01 ▁▃▇▃▁
umap_euclidean1 0 1.00 1.30 14.15 -36.53 -6.82 1.25 9.36 3.881000e+01 ▁▃▇▃▁
umap_haversine0 0 1.00 4.36 120.81 -855.47 -54.06 4.53 62.14 8.866400e+02 ▁▁▇▁▁
umap_haversine1 0 1.00 1.17 79.73 -568.70 -37.15 1.20 39.54 5.575500e+02 ▁▁▇▁▁
lat 0 1.00 42.53 4.99 -22.90 38.65 44.97 46.81 5.965000e+01 ▁▁▁▆▇
lon 0 1.00 -93.57 11.62 -158.23 -93.26 -92.08 -90.28 3.090000e+01 ▁▇▁▁▁
ScaleRank 83980 0.98 1.00 0.00 1.00 1.00 1.00 1.00 1.000000e+00 ▁▁▇▁▁
LabelRank 83980 0.98 1.00 0.00 1.00 1.00 1.00 1.00 1.000000e+00 ▁▁▇▁▁
ADM0_DIF 83980 0.98 0.00 0.00 0.00 0.00 0.00 0.00 0.000000e+00 ▁▁▇▁▁
LEVEL 83980 0.98 2.00 0.00 2.00 2.00 2.00 2.00 2.000000e+00 ▁▁▇▁▁
GEOU_DIF 83980 0.98 0.00 0.00 0.00 0.00 0.00 0.00 0.000000e+00 ▁▁▇▁▁
SU_DIF 83980 0.98 0.00 0.00 0.00 0.00 0.00 0.00 0.000000e+00 ▁▁▇▁▁
MAP_COLOR 83980 0.98 1.01 0.21 1.00 1.00 1.00 1.00 1.100000e+01 ▇▁▁▁▁
POP_EST 83980 0.98 305840704.97 19555866.99 4489409.00 307212123.00 307212123.00 307212123.00 3.072121e+08 ▁▁▁▁▇
GDP_MD_EST 83980 0.98 14192787.08 957328.31 82390.00 14260000.00 14260000.00 14260000.00 1.426000e+07 ▁▁▁▁▇
ISO_N3 83980 0.98 837.51 39.40 40.00 840.00 840.00 840.00 8.400000e+02 ▁▁▁▁▇
LON 83980 0.98 -98.75 6.73 -102.37 -99.14 -99.14 -99.14 3.138000e+01 ▇▁▁▁▁
LAT 83980 0.98 39.56 1.29 -10.84 39.53 39.53 39.53 6.284000e+01 ▁▁▁▇▁
AVOIDnumeric 83980 0.98 2.09 1.37 2.00 2.00 2.00 2.00 2.600000e+01 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
Time 0 1.00 2018-05-06 16:24:10 2024-01-02 14:31:47 2021-05-27 17:59:52 4853226
CurrentTripStart 4808480 0.01 2023-12-15 18:59:33 2023-12-31 18:01:01 2023-12-27 18:38:42 11

Cat umaps

# For each UMAP metric, derive 2-D plotting coordinates (`df$umap_0`,
# `df$umap_1`) and print one scatter plot per colouring column.
for (metricType in metricTypes) {
  # Names of the two embedding columns for this metric, e.g.
  # "umap_euclidean0". These are strings; inside aes() below, `umap_0` and
  # `umap_1` resolve to the `df` columns of the same name, not to these.
  umap_0 <- paste0("umap_", metricType, "0")
  umap_1 <- paste0("umap_", metricType, "1")

  if (metricType == "haversine") {
    # Treat the two embedding columns as angles on a sphere, convert to
    # Cartesian x/y/z, then flatten to 2-D with the transformation from the
    # umap-learn embedding-space docs: x = arctan2(x, y), y = -arccos(z).
    # `[[` extracts plain numeric vectors (`df[, ..col]` would return a
    # one-column data.table).
    df$x <- sin(df[[umap_0]]) * cos(df[[umap_1]])
    df$y <- sin(df[[umap_0]]) * sin(df[[umap_1]])
    df$z <- cos(df[[umap_0]])

    df$umap_0 <- atan2(df$x, df$y)
    df$umap_1 <- -acos(df$z)
  } else {
    # Other metrics are plotted as embedded.
    df$umap_0 <- df[[umap_0]]
    df$umap_1 <- df[[umap_1]]
  }

  # One plot per colouring column. An "Activity"-coloured plot used to be
  # built here but was overwritten before being printed; add "Activity" to
  # this vector to render it.
  # NOTE(review): `size` is not defined in this section -- it is expected to
  # be set earlier in the script.
  for (colorBy in c("GU_A3", "Name")) {
    p <- ggplot(df, aes(x = umap_0, y = umap_1, color = .data[[colorBy]])) +
      geom_point(alpha = 0.5, size = size) +
      labs(color = colorBy) +
      ggtitle(paste("Cat UMAP", metricType))
    # Draw the legend keys opaque and larger than the plotted points.
    p <- p +
      guides(color = guide_legend(override.aes = list(alpha = 1, size = 2)))
    print(p)
  }
}